Lab 5

Joining Data

library(tidyverse)
## -- Attaching packages ---------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.4
## v tibble  3.0.0     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
Confirmed_State_3_13 <-   read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/03-13-2020.csv")) %>%
  rename(Country_Region = "Country/Region", Province_State = "Province/State") %>% 
  filter (Country_Region == "US") %>% 
  group_by(Province_State, Country_Region) %>% 
  summarise(Confirmed = sum(Confirmed)) 
## Parsed with column specification:
## cols(
##   `Province/State` = col_character(),
##   `Country/Region` = col_character(),
##   `Last Update` = col_datetime(format = ""),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Latitude = col_double(),
##   Longitude = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
Confirmed_State_9_13 <-   read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/09-13-2020.csv")) %>% 
  filter (Country_Region == "US") %>% 
  group_by(Province_State, Country_Region) %>% 
  summarise(Confirmed = sum(Confirmed)) 
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
setdiff(Confirmed_State_9_13$Province_State, Confirmed_State_3_13$Province_State)
## [1] "Guam"                     "Northern Mariana Islands"
## [3] "Puerto Rico"              "Recovered"               
## [5] "Virgin Islands"
Confirmed_State_3_13_9_13_joined <- full_join(Confirmed_State_3_13,
      Confirmed_State_9_13, by = c("Province_State"))
Confirmed_State_3_13_9_13_joined <- full_join(Confirmed_State_3_13,
      Confirmed_State_9_13, by = c("Province_State")) %>% 
      rename(Confirmed_3_13_2020 = "Confirmed.x", Confirmed_9_13_2020 = "Confirmed.y") %>% 
      select(-Country_Region.x, -Country_Region.y) %>% 
      replace_na(list(Confirmed_3_13_2020 = 0))
which(is.na(Confirmed_State_3_13_9_13_joined))
## integer(0)
Confirmed_State_3_13_9_13_joined_long <- Confirmed_State_3_13_9_13_joined %>% 
              pivot_longer(-c(Province_State),
                            names_to = "Date", values_to = "Confirmed")
ggplot(Confirmed_State_3_13_9_13_joined_long, aes(x = Confirmed,  y = Province_State))  + 
    geom_point(aes(color = Date)) 

Working with Time Series Data

time_series_confirmed <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")) %>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
time_series_confirmed_long <- time_series_confirmed %>% 
               pivot_longer(-c(Province_State, Country_Region, Lat, Long),
                            names_to = "Date", values_to = "Confirmed") 
 download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv", 
               destfile = "~/UMass/time_series_covid19_deaths_global.csv")
 time_series_deaths <- read_csv("~/UMass/time_series_covid19_deaths_global.csv")%>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
 time_series_deaths_long <- time_series_deaths %>% 
               pivot_longer(-c(Province_State, Country_Region, Lat, Long),
                            names_to = "Date", values_to = "Deaths") 
time_series_confirmed_long <- time_series_confirmed_long %>% 
  unite(Key, Province_State, Country_Region, Date, sep = ".", remove = FALSE)
time_series_deaths_long <- time_series_deaths_long %>% 
  unite(Key, Province_State, Country_Region, Date, sep = ".") %>% 
  select(Key, Deaths)
time_series_long_joined <- full_join(time_series_confirmed_long,
              time_series_deaths_long, by = c("Key")) %>% 
              select(-Key)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
time_series_long_joined$Date <- mdy(time_series_long_joined$Date)
time_series_long_joined_counts <- time_series_long_joined %>% 
  pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
               names_to = "Report_Type", values_to = "Counts")

Making Graphs from Time Series Data

time_series_long_joined %>% 
  group_by(Country_Region,Date) %>% 
  summarise_at(c("Confirmed", "Deaths"), sum) %>% 
  filter (Country_Region == "US") %>% 
    ggplot(aes(x = Date,  y = Deaths)) + 
    geom_point() +
    geom_line() +
    ggtitle("US COVID-19 Deaths")

time_series_long_joined %>% 
  group_by(Country_Region,Date) %>% 
  summarise_at(c("Confirmed", "Deaths"), sum) %>% 
  filter (Country_Region %in% c("China","Japan", "Korea, South",
                                "Italy","Spain", "US")) %>% 
    ggplot(aes(x = Date,  y = Deaths)) + 
    geom_point() +
    geom_line() +
    ggtitle("COVID-19 Deaths") +
    facet_wrap(~Country_Region, ncol=2, scales="free_y")

time_series_long_joined %>% 
    group_by(Country_Region,Date) %>% 
    summarise_at(c("Confirmed", "Deaths"), sum) %>% 
    filter (Country_Region %in% c("China","France","Italy", 
                                "Korea, South", "US")) %>% 
    ggplot(aes(x = Date,  y = Deaths, color = Country_Region)) + 
    geom_point() +
    geom_line() +
    ggtitle("COVID-19 Deaths")

time_series_long_joined_counts %>% 
  group_by(Country_Region, Report_Type, Date) %>% 
  summarise(Counts = sum(Counts)) %>% 
  filter (Country_Region == "US") %>% 
    ggplot(aes(x = Date,  y = log2(Counts), fill = Report_Type, color = Report_Type)) + 
    geom_point() +
    geom_line() +
    ggtitle("US COVID-19 Cases")
## `summarise()` regrouping output by 'Country_Region', 'Report_Type' (override with `.groups` argument)

## Repeating the Process with 6/13/20

Confirmed_State_6_13 <-   read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/06-13-2020.csv")) %>% 
  filter (Country_Region == "US") %>% 
  group_by(Province_State, Country_Region) %>% 
  summarise(Confirmed = sum(Confirmed)) 
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
Confirmed_State_9_13 <-   read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/09-13-2020.csv")) %>% 
  filter (Country_Region == "US") %>% 
  group_by(Province_State, Country_Region) %>% 
  summarise(Confirmed = sum(Confirmed)) 
## Parsed with column specification:
## cols(
##   FIPS = col_double(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Last_Update = col_datetime(format = ""),
##   Lat = col_double(),
##   Long_ = col_double(),
##   Confirmed = col_double(),
##   Deaths = col_double(),
##   Recovered = col_double(),
##   Active = col_double(),
##   Combined_Key = col_character(),
##   Incidence_Rate = col_double(),
##   `Case-Fatality_Ratio` = col_double()
## )
## `summarise()` regrouping output by 'Province_State' (override with `.groups` argument)
setdiff(Confirmed_State_9_13$Province_State, Confirmed_State_6_13$Province_State)
## character(0)
Confirmed_State_6_13_9_13_joined <- full_join(Confirmed_State_6_13,
      Confirmed_State_9_13, by = c("Province_State"))
Confirmed_State_6_13_9_13_joined <- full_join(Confirmed_State_6_13,
      Confirmed_State_9_13, by = c("Province_State")) %>% 
      rename(Confirmed_6_13_2020 = "Confirmed.x", Confirmed_9_13_2020 = "Confirmed.y") %>% 
      select(-Country_Region.x, -Country_Region.y) %>% 
      replace_na(list(Confirmed_6_13_2020 = 0))
which(is.na(Confirmed_State_6_13_9_13_joined))
## integer(0)
Confirmed_State_6_13_9_13_joined_long <- Confirmed_State_6_13_9_13_joined %>% 
              pivot_longer(-c(Province_State),
                            names_to = "Date", values_to = "Confirmed")
ggplot(Confirmed_State_6_13_9_13_joined_long, aes(x = Confirmed,  y = Province_State))  + 
    geom_bar(stat = 'identity', aes(color = Date)) +
  labs(title = "Confirmed Cases by State", x = "Confirmed COVID Cases", y = "State")

Making a Plot for Confirmed Deaths/Day

time_series_long_joined %>% 
  group_by(Date) %>% 
  summarise_at(c("Confirmed", "Deaths"), sum) %>% 
    ggplot(aes(x = Date,  y = Deaths)) + 
    geom_point() +
    geom_line() +
    ggtitle("Global COVID-19 Deaths")

Lab Extra

Data for Lab

library(tidyverse)
library(lubridate)
time_series_confirmed_long <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")) %>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")  %>% 
               pivot_longer(-c(Province_State, Country_Region, Lat, Long),
                             names_to = "Date", values_to = "Confirmed") 
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
# Let's get the times series data for deaths
time_series_deaths_long <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")) %>%
  rename(Province_State = "Province/State", Country_Region = "Country/Region")  %>% 
  pivot_longer(-c(Province_State, Country_Region, Lat, Long),
               names_to = "Date", values_to = "Deaths")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## See spec(...) for full column specifications.
# Create Keys 
time_series_confirmed_long <- time_series_confirmed_long %>% 
  unite(Key, Province_State, Country_Region, Date, sep = ".", remove = FALSE)
time_series_deaths_long <- time_series_deaths_long %>% 
  unite(Key, Province_State, Country_Region, Date, sep = ".") %>% 
  select(Key, Deaths)
# Join tables
time_series_long_joined <- full_join(time_series_confirmed_long,
    time_series_deaths_long, by = c("Key")) %>% 
    select(-Key)
# Reformat the data
time_series_long_joined$Date <- mdy(time_series_long_joined$Date)
# Create Report table with counts
time_series_long_joined_counts <- time_series_long_joined %>% 
  pivot_longer(-c(Province_State, Country_Region, Lat, Long, Date),
               names_to = "Report_Type", values_to = "Counts")

Graphic Output

# Plot graph to a pdf outputfile
pdf("~/UMass/time_series_example_plot.pdf", width=6, height=5)
time_series_long_joined %>% 
  group_by(Country_Region,Date) %>% 
  summarise_at(c("Confirmed", "Deaths"), sum) %>% 
  filter (Country_Region == "US") %>% 
    ggplot(aes(x = Date,  y = Deaths)) + 
    geom_point() +
    geom_line() +
    ggtitle("US COVID-19 Deaths")
dev.off()
## png 
##   2
# Plot graph to a png outputfile
ppi <- 300
png("C:/Users/eliaj/OneDrive/Documents/UMass/time_series_example_plot.png", width=8*ppi, height=8*ppi, res=ppi)
time_series_long_joined %>% 
  group_by(Country_Region,Date) %>% 
  summarise_at(c("Confirmed", "Deaths"), sum) %>% 
  filter (Country_Region == "US") %>% 
    ggplot(aes(x = Date,  y = Deaths)) + 
    geom_point() +
    geom_line() +
    ggtitle("US COVID-19 Deaths")
dev.off()
## png 
##   2

RMarkdown Loading Images

US COVID-19 Deaths US COVID-19 Deaths ### Interactive Graphs

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ggplotly(
  time_series_long_joined %>% 
    group_by(Country_Region,Date) %>% 
    summarise_at(c("Confirmed", "Deaths"), sum) %>% 
    filter (Country_Region == "US") %>% 
    ggplot(aes(x = Date,  y = Deaths)) + 
      geom_point() +
      geom_line() +
      ggtitle("US COVID-19 Deaths"))
time_series_deaths_per_confirmed <- time_series_long_joined %>% 
  filter(!is.na(Deaths), !is.na(Confirmed)) %>%
  mutate(deaths_per_confirmed = Deaths / Confirmed)
time_series_deaths_per_confirmed %>% 
  group_by(Country_Region,Date) %>% 
    summarise_at(c("Confirmed", "Deaths"), sum) %>% 
    filter (Country_Region == "US") %>% 
  ggplot(aes(x = Date, y = Deaths / Confirmed)) +
  geom_point() +
  geom_line()

  ggtitle("US COVID-19 Deaths/Confirmed")
## $title
## [1] "US COVID-19 Deaths/Confirmed"
## 
## attr(,"class")
## [1] "labels"
  time_series_deaths_per_confirmed %>% 
  group_by(Country_Region,Date) %>% 
    summarise_at(c("Confirmed", "Deaths"), sum) %>% 
    filter (Country_Region %in% c("Brazil","India","Mexico","UK","US","Italy","Peru", "France", "Spain", "Iran")) %>% 
  ggplot(aes(x = Date, y = Deaths / Confirmed, color = Country_Region)) +
  geom_line() 
## Warning: Removed 173 row(s) containing missing values (geom_path).

  ggtitle("Top 10 Global COVID-19 Deaths/Confirmed")
## $title
## [1] "Top 10 Global COVID-19 Deaths/Confirmed"
## 
## attr(,"class")
## [1] "labels"

Using Facet Wrap

 time_series_deaths_per_confirmed %>% 
  group_by(Country_Region,Date) %>% 
    summarise_at(c("Confirmed", "Deaths"), sum) %>% 
    filter (Country_Region %in% c("Brazil","India","Mexico","UK","US","Italy","Peru", "France", "Spain", "Iran")) %>% 
  ggplot(aes(x = Date, y = Deaths / Confirmed, color = Country_Region)) +
  geom_line() +
   facet_wrap(~Country_Region) +
  theme_dark()
## Warning: Removed 173 row(s) containing missing values (geom_path).

  ggtitle("Top 10 Global COVID-19 Deaths/Confirmed")
## $title
## [1] "Top 10 Global COVID-19 Deaths/Confirmed"
## 
## attr(,"class")
## [1] "labels"
time_series_confirmed_long_states <- time_series_confirmed_long %>% 
  group_by(Province_State, Date) %>% 
  filter(Country_Region == "US") %>% 
  summarise_at(c("Confirmed"), sum)
    ggplot(time_series_confirmed_long_states, aes(x = Date,  y = Confirmed)) + 
    geom_line() +
    ggtitle("US COVID-19 Confirmed")
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

    facet_wrap(~Province_State)
## <ggproto object: Class FacetWrap, Facet, gg>
##     compute_layout: function
##     draw_back: function
##     draw_front: function
##     draw_labels: function
##     draw_panels: function
##     finish_data: function
##     init_scales: function
##     map_data: function
##     params: list
##     setup_data: function
##     setup_params: function
##     shrink: TRUE
##     train_scales: function
##     vars: function
##     super:  <ggproto object: Class FacetWrap, Facet, gg>

Animated Graphs

library(gganimate)
library(transformr)
data_time <- time_series_long_joined %>% 
    group_by(Country_Region,Date) %>% 
    summarise_at(c("Confirmed", "Deaths"), sum) %>% 
    filter (Country_Region %in% c("China","Korea, South","Japan","Italy","US")) 
p <- ggplot(data_time, aes(x = Date,  y = Confirmed, color = Country_Region)) + 
      geom_point() +
      geom_line() +
      ggtitle("Confirmed COVID-19 Cases") +
      geom_point(aes(group = seq_along(Date))) +
      transition_reveal(Date)

data_time <- time_series_long_joined %>% 
    group_by(Country_Region,Date) %>% 
    summarise_at(c("Confirmed", "Deaths"), sum) %>% 
    filter (Country_Region %in% c("China","Korea, South","Japan","Italy","US")) 
p <- ggplot(data_time, aes(x = Date,  y = Confirmed, color = Country_Region)) + 
      geom_point() +
      geom_line() +
      ggtitle("Confirmed COVID-19 Cases") +
      geom_point(aes(group = seq_along(Date))) +
      transition_reveal(Date) 
anim_save("deaths_5_countries.gif", p)